import pandas as pd;
import pingouin as pg;
import seaborn as sns;
import scipy.stats as stats;
import numpy as np
import matplotlib.pyplot as plt
import scikit_posthocs as scp
import statsmodels as st
import operator
Función que nos permite obtener la matriz de ranking aplicando un test pareado: el t-test paramétrico o el test no paramétrico de Wilcoxon.
def _reject_hypotheses(pvalues, alpha, method):
    """Return a boolean array marking which hypotheses are rejected after
    multiple-comparison correction.

    'bonferroni' and 'holm' are implemented directly with numpy; any other
    method falls back to statsmodels.  NOTE: the original code called
    `st.stats.multitest.multipletests`, which raises AttributeError because
    `import statsmodels` does not import the `stats` submodule.
    """
    m = len(pvalues)
    if m == 0:
        return np.zeros(0, dtype=bool)
    if method == 'bonferroni':
        # Reject when p <= alpha/m (equivalent to adjusted p*m <= alpha).
        return pvalues <= alpha / m
    if method == 'holm':
        # Step-down Holm: sort p-values, scale the k-th smallest by (m - k),
        # and enforce monotonicity with a running maximum.
        order = np.argsort(pvalues)
        adjusted = np.maximum.accumulate(
            np.minimum((m - np.arange(m)) * pvalues[order], 1.0))
        reject = np.empty(m, dtype=bool)
        reject[order] = adjusted <= alpha
        return reject
    # Any other correction method: delegate to statsmodels (proper import).
    from statsmodels.stats.multitest import multipletests
    return multipletests(pvalues, alpha=alpha, method=method)[0]

def pairwise_test(df, parametric = True, method = 'bonferroni', decreasing = True, ties=True):
    """Build a wins/ties/losses ranking matrix from pairwise paired tests.

    Parameters
    ----------
    df : pd.DataFrame
        Models in the columns, one row per paired measurement (e.g. CV fold).
    parametric : bool
        True -> paired t-test (stats.ttest_rel); False -> Wilcoxon signed-rank.
    method : str
        Multiple-comparison correction: 'bonferroni', 'holm', or any method
        name accepted by statsmodels' multipletests.
    decreasing : bool
        True when a lower value of the measure is better (errors such as
        RMSE/MAE); False when higher is better (scores such as R2).
    ties : bool
        Whether the 'ties' column is kept in the returned frame.

    Returns
    -------
    pd.DataFrame
        Indexed by model, integer columns wins/(ties)/losses/diff where
        diff = wins - losses.
    """
    relate = operator.lt if decreasing else operator.gt
    test = stats.ttest_rel if parametric else stats.wilcoxon
    cols = df.columns
    n = len(cols)

    # Raw p-values for every (i, j) pair, upper-triangle order.
    pvalues = np.array([test(df.iloc[:, i], df.iloc[:, j]).pvalue
                        for i in range(n) for j in range(i + 1, n)])
    significant = _reject_hypotheses(pvalues, 0.05, method)

    ranks = pd.DataFrame({"wins": np.zeros(n),
                          "ties": np.zeros(n),
                          "losses": np.zeros(n)},
                         index=cols)
    pos = 0
    for i in range(n):
        for j in range(i + 1, n):
            if not significant[pos]:
                # No significant difference: both models record a tie.
                ranks.iloc[i, 1] += 1
                ranks.iloc[j, 1] += 1
            elif relate(df.iloc[:, i].mean(), df.iloc[:, j].mean()):
                ranks.iloc[i, 0] += 1
                ranks.iloc[j, 2] += 1
            else:
                ranks.iloc[j, 0] += 1
                ranks.iloc[i, 2] += 1
            pos += 1
    ranks["diff"] = ranks.wins - ranks.losses
    if not ties:
        ranks.drop('ties', axis='columns', inplace=True)
    return ranks.astype(int)
Lectura de la tabla con el RMSE en cada fold
# Load the per-fold RMSE table from training (models in columns, one row per
# CV fold).  NOTE(review): "rsme" in the variable name is presumably a typo
# for RMSE, matching the CSV filename.
tabla_rsme = pd.read_csv('../Datos_preprocesados/RMSE_errores_train.csv')
tabla_rsme
| Pliegues | RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Fold 1 | 0.010250 | 0.012253 | 0.040040 | 0.021895 | 0.010777 | 0.012175 | 0.015468 | 0.017608 | 0.009779 | 0.011421 | 0.009565 | 0.011302 |
| 1 | Fold 2 | 0.003465 | 0.005035 | 0.006011 | 0.008859 | 0.005831 | 0.004163 | 0.006113 | 0.004916 | 0.003236 | 0.005702 | 0.005437 | 0.004665 |
| 2 | Fold 3 | 0.002359 | 0.002950 | 0.002599 | 0.004666 | 0.002282 | 0.002253 | 0.007964 | 0.006088 | 0.002058 | 0.001663 | 0.002220 | 0.002200 |
| 3 | Fold 4 | 0.005817 | 0.005213 | 0.008027 | 0.008158 | 0.006893 | 0.007117 | 0.012659 | 0.018517 | 0.005418 | 0.005386 | 0.006962 | 0.007347 |
| 4 | Fold 5 | 0.019228 | 0.018612 | 0.015113 | 0.017661 | 0.014720 | 0.014294 | 0.018877 | 0.022049 | 0.018109 | 0.019953 | 0.015303 | 0.015526 |
| 5 | Fold 6 | 0.097717 | 0.099564 | 0.030118 | 0.027917 | 0.031447 | 0.029454 | 0.031626 | 0.026445 | 0.094637 | 0.099538 | 0.031492 | 0.028937 |
| 6 | Fold 7 | 0.054043 | 0.054168 | 0.047621 | 0.046455 | 0.048174 | 0.044944 | 0.050317 | 0.046399 | 0.049422 | 0.054665 | 0.048253 | 0.045141 |
| 7 | Fold 8 | 0.117732 | 0.113723 | 0.064359 | 0.061369 | 0.063967 | 0.061773 | 0.066249 | 0.059950 | 0.118328 | 0.118405 | 0.064465 | 0.062454 |
| 8 | Fold 9 | 0.124888 | 0.141807 | 0.085793 | 0.091884 | 0.085716 | 0.091056 | 0.089009 | 0.093796 | 0.121939 | 0.142605 | 0.085260 | 0.091053 |
| 9 | Fold 10 | 0.060135 | 0.064386 | 0.029916 | 0.038327 | 0.029938 | 0.036199 | 0.029868 | 0.036866 | 0.048429 | 0.046525 | 0.029645 | 0.035670 |
Se elimina la columna con los pliegues porque esa información está en el índice
# The fold label is redundant with the row index, so drop the column and
# summarise the RMSE distribution per model.
tabla_rsme.drop(columns = "Pliegues", inplace=True)
tabla_rsme.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.049563 | 0.051771 | 0.032960 | 0.032719 | 0.029975 | 0.030343 | 0.032815 | 0.033264 | 0.047135 | 0.050586 | 0.029860 | 0.030430 |
| std | 0.048881 | 0.051414 | 0.027178 | 0.027624 | 0.028047 | 0.028913 | 0.027536 | 0.027363 | 0.048149 | 0.052089 | 0.028119 | 0.028922 |
| min | 0.002359 | 0.002950 | 0.002599 | 0.004666 | 0.002282 | 0.002253 | 0.006113 | 0.004916 | 0.002058 | 0.001663 | 0.002220 | 0.002200 |
| 25% | 0.006925 | 0.006973 | 0.009799 | 0.011060 | 0.007864 | 0.008381 | 0.013362 | 0.017836 | 0.006509 | 0.007132 | 0.007613 | 0.008336 |
| 50% | 0.036635 | 0.036390 | 0.030017 | 0.024906 | 0.022329 | 0.021874 | 0.024372 | 0.024247 | 0.033269 | 0.033239 | 0.022474 | 0.022231 |
| 75% | 0.088322 | 0.090770 | 0.045726 | 0.044423 | 0.043993 | 0.042758 | 0.045644 | 0.044016 | 0.083333 | 0.088320 | 0.044063 | 0.042774 |
| max | 0.124888 | 0.141807 | 0.085793 | 0.091884 | 0.085716 | 0.091056 | 0.089009 | 0.093796 | 0.121939 | 0.142605 | 0.085260 | 0.091053 |
Mostramos un diagrama de cajas para cada uno de los modelos
# Interactive box plot of the RMSE distribution per model.  The leftover
# debugging code that imported flask and wrapped the figure in
# `flask.Markup(fig)` (an HTML-escaping helper, no effect on a plotly Figure,
# result discarded) has been removed.
import plotly.express as px

fig = px.box(tabla_rsme, y = tabla_rsme.columns)
fig.show()
# Static matplotlib version of the same plot.
tabla_rsme.boxplot()
<AxesSubplot:>
Primero analizamos si las diferentes muestras (resultados de la validación cruzada por cada modelo) proceden de una distribución normal. Para ello vamos a aplicar el test de Shapiro-Wilks.
tabla_rsme.apply(stats.shapiro)
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.848849 | 0.865220 | 0.926488 | 0.894526 | 0.881280 | 0.884258 | 0.874022 | 0.886963 | 0.833152 | 0.855404 | 0.880959 | 0.884850 |
| 1 | 0.056286 | 0.087895 | 0.414248 | 0.190580 | 0.134983 | 0.145985 | 0.111330 | 0.156691 | 0.036515 | 0.067337 | 0.133844 | 0.148267 |
Como podemos ver, existe un p-valor menor que el nivel de significancia de 0.05 (el de GBR_lag3), con lo que en ese caso la muestra no sigue una distribución normal. Esto lo podemos ver de forma gráfica.
# Density plot of the RMSE distributions, legend placed outside the axes.
f = plt.figure()
plt.title("Distribuiciones de los resultados RMSE")
tabla_rsme.plot(kind='density', ax=f.gca())
f.gca().set_xlabel('RSME')
plt.legend(loc='right', bbox_to_anchor=(1.75,0.5))
<matplotlib.legend.Legend at 0x1d2fa85c940>
# Friedman test: non-parametric check for differences across the paired
# samples (one column per model, matched by fold).
test = stats.friedmanchisquare(*[tabla_rsme[col] for col in tabla_rsme.columns])
test
FriedmanchisquareResult(statistic=22.907692307692344, pvalue=0.018212427063207203)
El test de Friedman sí detecta diferencias significativas (p ≈ 0.018 &lt; 0.05); sin embargo, el test post-hoc de Nemenyi no encuentra diferencias significativas entre pares.
scp.posthoc_nemenyi_friedman(tabla_rsme, melted=False)
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RF_lag3 | 1.000000 | 0.900000 | 0.9 | 0.900000 | 0.833739 | 0.599927 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.677865 | 0.638894 |
| RF_lag5 | 0.900000 | 1.000000 | 0.9 | 0.900000 | 0.440001 | 0.212062 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.277058 | 0.242858 |
| LR_lag3 | 0.900000 | 0.900000 | 1.0 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.900000 | 0.900000 |
| LR_lag5 | 0.900000 | 0.900000 | 0.9 | 1.000000 | 0.900000 | 0.794768 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.872707 | 0.833739 |
| Lasso_lag3 | 0.833739 | 0.440001 | 0.9 | 0.900000 | 1.000000 | 0.900000 | 0.599927 | 0.900000 | 0.9 | 0.560957 | 0.900000 | 0.900000 |
| Lasso_lag5 | 0.599927 | 0.212062 | 0.9 | 0.794768 | 0.900000 | 1.000000 | 0.353264 | 0.872707 | 0.9 | 0.313427 | 0.900000 | 0.900000 |
| SVR_lag3 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.599927 | 0.353264 | 1.000000 | 0.900000 | 0.9 | 0.900000 | 0.440001 | 0.396312 |
| SVR_lag5 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.900000 | 0.872707 | 0.900000 | 1.000000 | 0.9 | 0.900000 | 0.900000 | 0.900000 |
| GBR_lag3 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 1.0 | 0.900000 | 0.900000 | 0.900000 |
| GBR_lag5 | 0.900000 | 0.900000 | 0.9 | 0.900000 | 0.560957 | 0.313427 | 0.900000 | 0.900000 | 0.9 | 1.000000 | 0.396312 | 0.353264 |
| EN_lag3 | 0.677865 | 0.277058 | 0.9 | 0.872707 | 0.900000 | 0.900000 | 0.440001 | 0.900000 | 0.9 | 0.396312 | 1.000000 | 0.900000 |
| EN_lag5 | 0.638894 | 0.242858 | 0.9 | 0.833739 | 0.900000 | 0.900000 | 0.396312 | 0.900000 | 0.9 | 0.353264 | 0.900000 | 1.000000 |
Para facilitar el cálculo de la matriz de ranking, vamos a utilizar la función que definimos antes que utiliza el test de Wilcoxon.
# Wilcoxon-based ranking with Holm correction; lower RMSE is better
# (decreasing=True) and the ties column is dropped from the output.
rsme_ranks = pairwise_test(tabla_rsme, parametric=False, decreasing=True, ties= False, method="holm")
rsme_ranks
| wins | losses | diff | |
|---|---|---|---|
| RF_lag3 | 0 | 0 | 0 |
| RF_lag5 | 0 | 0 | 0 |
| LR_lag3 | 0 | 0 | 0 |
| LR_lag5 | 0 | 0 | 0 |
| Lasso_lag3 | 0 | 0 | 0 |
| Lasso_lag5 | 0 | 0 | 0 |
| SVR_lag3 | 0 | 0 | 0 |
| SVR_lag5 | 0 | 0 | 0 |
| GBR_lag3 | 0 | 0 | 0 |
| GBR_lag5 | 0 | 0 | 0 |
| EN_lag3 | 0 | 0 | 0 |
| EN_lag5 | 0 | 0 | 0 |
# Same analysis for the per-fold MAE table.
tabla_mae = pd.read_csv('../Datos_preprocesados/MAE_errores_train.csv')
tabla_mae
| Pliegues | RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Fold 1 | 0.008462 | 0.011392 | 0.032479 | 0.014992 | 0.007227 | 0.007082 | 0.012623 | 0.017494 | 0.007774 | 0.009879 | 0.006476 | 0.005981 |
| 1 | Fold 2 | 0.003400 | 0.005021 | 0.004186 | 0.003734 | 0.004024 | 0.002734 | 0.005366 | 0.003984 | 0.003021 | 0.005115 | 0.003658 | 0.003443 |
| 2 | Fold 3 | 0.001714 | 0.002173 | 0.001866 | 0.003266 | 0.001329 | 0.001685 | 0.005990 | 0.004277 | 0.001440 | 0.001243 | 0.001306 | 0.001632 |
| 3 | Fold 4 | 0.002247 | 0.002934 | 0.003994 | 0.002763 | 0.002256 | 0.002159 | 0.007597 | 0.010134 | 0.001828 | 0.001652 | 0.001959 | 0.002001 |
| 4 | Fold 5 | 0.012167 | 0.013229 | 0.011721 | 0.007150 | 0.011192 | 0.012083 | 0.014557 | 0.018390 | 0.012484 | 0.013561 | 0.012501 | 0.012333 |
| 5 | Fold 6 | 0.084448 | 0.087158 | 0.016943 | 0.016096 | 0.017917 | 0.016959 | 0.021310 | 0.012787 | 0.082231 | 0.087985 | 0.020494 | 0.016918 |
| 6 | Fold 7 | 0.033824 | 0.033456 | 0.026278 | 0.026127 | 0.025823 | 0.027261 | 0.034723 | 0.028557 | 0.022534 | 0.032382 | 0.026455 | 0.025546 |
| 7 | Fold 8 | 0.065309 | 0.056947 | 0.036142 | 0.032730 | 0.035894 | 0.033170 | 0.040139 | 0.031767 | 0.074404 | 0.061051 | 0.037323 | 0.032217 |
| 8 | Fold 9 | 0.098532 | 0.118379 | 0.052798 | 0.042889 | 0.055291 | 0.043395 | 0.059637 | 0.043899 | 0.099885 | 0.115925 | 0.054401 | 0.042336 |
| 9 | Fold 10 | 0.043867 | 0.055397 | 0.020134 | 0.024806 | 0.020085 | 0.023185 | 0.018272 | 0.020647 | 0.030652 | 0.040988 | 0.019380 | 0.022019 |
# Drop the redundant fold column and show the MAE distributions
# (interactive plotly box plot plus a static matplotlib one).
tabla_mae.drop(columns = "Pliegues", inplace=True)
fig = px.box(tabla_mae, y = tabla_mae.columns)
fig.show()
tabla_mae.boxplot()
<AxesSubplot:>
tabla_mae.apply(stats.shapiro)
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.859676 | 0.862087 | 0.935417 | 0.912743 | 0.886039 | 0.917546 | 0.869829 | 0.946054 | 0.813711 | 0.859454 | 0.894715 | 0.919158 |
| 1 | 0.075641 | 0.080753 | 0.503236 | 0.300369 | 0.152955 | 0.336945 | 0.099507 | 0.622112 | 0.021268 | 0.075185 | 0.191507 | 0.349987 |
# Density plot of the MAE distributions, legend placed outside the axes.
f = plt.figure()
plt.title("Distribuiciones de los resultados de la validación cruzada")
tabla_mae.plot(kind='density', ax=f.gca())
f.gca().set_xlabel('MAE')
plt.legend(loc='right', bbox_to_anchor=(1.5,0.5))
<matplotlib.legend.Legend at 0x1d2fa6536d0>
Al igual que antes he hecho el test en R y se rechaza la hipótesis nula de que se cumple la esfericidad
# Friedman test across the MAE samples (one column per model, paired by fold).
test = stats.friedmanchisquare(*[tabla_mae[col] for col in tabla_mae.columns])
test
FriedmanchisquareResult(statistic=34.584615384615404, pvalue=0.0002900649084931501)
scp.posthoc_nemenyi_friedman(tabla_mae, melted=False)
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RF_lag3 | 1.000000 | 0.900000 | 0.900000 | 0.900000 | 0.716832 | 0.599927 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.716832 | 0.183504 |
| RF_lag5 | 0.900000 | 1.000000 | 0.638894 | 0.183504 | 0.068963 | 0.038871 | 0.900000 | 0.872707 | 0.440001 | 0.900000 | 0.068963 | 0.003226 |
| LR_lag3 | 0.900000 | 0.638894 | 1.000000 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.677865 |
| LR_lag5 | 0.900000 | 0.183504 | 0.900000 | 1.000000 | 0.900000 | 0.900000 | 0.440001 | 0.900000 | 0.900000 | 0.833739 | 0.900000 | 0.900000 |
| Lasso_lag3 | 0.716832 | 0.068963 | 0.900000 | 0.900000 | 1.000000 | 0.900000 | 0.212062 | 0.900000 | 0.900000 | 0.599927 | 0.900000 | 0.900000 |
| Lasso_lag5 | 0.599927 | 0.038871 | 0.900000 | 0.900000 | 0.900000 | 1.000000 | 0.135020 | 0.872707 | 0.900000 | 0.482355 | 0.900000 | 0.900000 |
| SVR_lag3 | 0.900000 | 0.900000 | 0.900000 | 0.440001 | 0.212062 | 0.135020 | 1.000000 | 0.900000 | 0.716832 | 0.900000 | 0.212062 | 0.016850 |
| SVR_lag5 | 0.900000 | 0.872707 | 0.900000 | 0.900000 | 0.900000 | 0.872707 | 0.900000 | 1.000000 | 0.900000 | 0.900000 | 0.900000 | 0.440001 |
| GBR_lag3 | 0.900000 | 0.440001 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.716832 | 0.900000 | 1.000000 | 0.900000 | 0.900000 | 0.872707 |
| GBR_lag5 | 0.900000 | 0.900000 | 0.900000 | 0.833739 | 0.599927 | 0.482355 | 0.900000 | 0.900000 | 0.900000 | 1.000000 | 0.599927 | 0.115746 |
| EN_lag3 | 0.716832 | 0.068963 | 0.900000 | 0.900000 | 0.900000 | 0.900000 | 0.212062 | 0.900000 | 0.900000 | 0.599927 | 1.000000 | 0.900000 |
| EN_lag5 | 0.183504 | 0.003226 | 0.677865 | 0.900000 | 0.900000 | 0.900000 | 0.016850 | 0.440001 | 0.872707 | 0.115746 | 0.900000 | 1.000000 |
# Wilcoxon ranking for MAE, sorted alphabetically by model.
# NOTE(review): this uses the default Bonferroni correction while the RMSE
# ranking used method="holm" -- confirm whether that difference is intended.
mae_ranks = pairwise_test(tabla_mae,decreasing = True,parametric=False, ties=False).sort_index()
mae_ranks
| wins | losses | diff | |
|---|---|---|---|
| EN_lag3 | 0 | 0 | 0 |
| EN_lag5 | 0 | 0 | 0 |
| GBR_lag3 | 0 | 0 | 0 |
| GBR_lag5 | 0 | 0 | 0 |
| LR_lag3 | 0 | 0 | 0 |
| LR_lag5 | 0 | 0 | 0 |
| Lasso_lag3 | 0 | 0 | 0 |
| Lasso_lag5 | 0 | 0 | 0 |
| RF_lag3 | 0 | 0 | 0 |
| RF_lag5 | 0 | 0 | 0 |
| SVR_lag3 | 0 | 0 | 0 |
| SVR_lag5 | 0 | 0 | 0 |
# Side-by-side comparison of the RMSE and MAE rankings.  `keys` labels each
# sub-frame; without it the concatenated result has duplicated, ambiguous
# column names (wins/losses/diff appearing twice).
final_ranks = pd.concat([rsme_ranks, mae_ranks], axis=1, keys=["RMSE", "MAE"])
final_ranks
| wins | losses | diff | wins | losses | diff | |
|---|---|---|---|---|---|---|
| RF_lag3 | 0 | 0 | 0 | 0 | 0 | 0 |
| RF_lag5 | 0 | 0 | 0 | 0 | 0 | 0 |
| LR_lag3 | 0 | 0 | 0 | 0 | 0 | 0 |
| LR_lag5 | 0 | 0 | 0 | 0 | 0 | 0 |
| Lasso_lag3 | 0 | 0 | 0 | 0 | 0 | 0 |
| Lasso_lag5 | 0 | 0 | 0 | 0 | 0 | 0 |
| SVR_lag3 | 0 | 0 | 0 | 0 | 0 | 0 |
| SVR_lag5 | 0 | 0 | 0 | 0 | 0 | 0 |
| GBR_lag3 | 0 | 0 | 0 | 0 | 0 | 0 |
| GBR_lag5 | 0 | 0 | 0 | 0 | 0 | 0 |
| EN_lag3 | 0 | 0 | 0 | 0 | 0 | 0 |
| EN_lag5 | 0 | 0 | 0 | 0 | 0 | 0 |
# Same pipeline for the per-fold R2 table: load, drop the redundant fold
# column, box plot, normality test, density plot.
tabla_R2 = pd.read_csv('../Datos_preprocesados/R2_errores_train.csv')
tabla_R2
tabla_R2.drop(columns = "Pliegues", inplace=True)
tabla_R2.boxplot()
# Shapiro-Wilk per model: row 0 = statistic, row 1 = p-value.
tabla_R2.apply(stats.shapiro)
f = plt.figure()
plt.title("Distribuiciones de los resultados de la validación cruzada")
tabla_R2.plot(kind='density', ax=f.gca())
f.gca().set_xlabel('R2')
plt.legend(loc='right', bbox_to_anchor=(1.75,0.5))
En este caso podemos ver que las muestras proceden de una distribución normal. Esto nos lleva a comprobar la otra condición de parametricidad, la homocedasticidad o la esfericidad. Para ello vamos a aplicar el test de Mauchly.
# Friedman test plus post-hoc Nemenyi on the R2 results.
test = stats.friedmanchisquare(*[tabla_R2[col] for col in tabla_R2.columns])
test
scp.posthoc_nemenyi_friedman(tabla_R2, melted=False)
De la misma forma que hemos hecho para el caso no paramétrico, para automatizar los cálculos vamos a aplicar el test por pares con la corrección de Bonferroni.
# Ranking for R2; higher is better, hence decreasing=False.
# NOTE(review): the text above announces a paired t-test with Bonferroni, but
# parametric=False runs the Wilcoxon test instead -- confirm which was intended.
cc_ranks = pairwise_test(tabla_R2,parametric=False,decreasing=False,ties=False).sort_index()
cc_ranks
final_ranks = pd.concat([rsme_ranks,mae_ranks,cc_ranks], axis=1)
final_ranks
#shopper_rank.to_csv('../Data_Produced/shopper_rank_ranks.csv')
#shopper_rank.to_latex('../Data_Produced/shopper_rank_ranks.tex')
Lectura de la tabla
# Load the per-fold correlation-coefficient (CC) table.
tabla_cc = pd.read_csv('../Datos_preprocesados/CC_train.csv')
tabla_cc
| Pliegues | RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Fold 1 | 0.660 | 0.851 | 0.396 | 0.465 | 0.431 | 0.433 | 0.485 | 0.465 | 0.592 | 0.875 | 0.458 | 0.450 |
| 1 | Fold 2 | 0.725 | 0.647 | 0.065 | -0.042 | 0.160 | 0.047 | 0.245 | -0.009 | 0.269 | 0.734 | 0.202 | 0.109 |
| 2 | Fold 3 | 0.866 | 0.907 | 0.467 | 0.556 | 0.486 | 0.571 | 0.491 | 0.553 | 0.646 | 0.895 | 0.497 | 0.571 |
| 3 | Fold 4 | 0.926 | 0.934 | 0.337 | 0.433 | 0.320 | 0.445 | 0.336 | 0.535 | 0.871 | 0.942 | 0.315 | 0.442 |
| 4 | Fold 5 | 0.863 | 0.889 | 0.568 | 0.601 | 0.561 | 0.602 | 0.689 | 0.748 | 0.786 | 0.981 | 0.571 | 0.598 |
| 5 | Fold 6 | 0.928 | 0.932 | 0.793 | 0.803 | 0.796 | 0.803 | 0.824 | 0.826 | 0.938 | 0.985 | 0.796 | 0.802 |
| 6 | Fold 7 | 0.837 | 0.910 | 0.517 | 0.583 | 0.508 | 0.600 | 0.477 | 0.690 | 0.909 | 0.976 | 0.501 | 0.598 |
| 7 | Fold 8 | 0.943 | 0.944 | 0.888 | 0.913 | 0.887 | 0.910 | 0.860 | 0.873 | 0.966 | 0.981 | 0.884 | 0.907 |
| 8 | Fold 9 | 0.933 | 0.948 | 0.864 | 0.885 | 0.862 | 0.879 | 0.817 | 0.811 | 0.989 | 0.981 | 0.861 | 0.879 |
| 9 | Fold 10 | 0.804 | 0.782 | 0.620 | 0.586 | 0.609 | 0.591 | 0.621 | 0.638 | 0.844 | 0.968 | 0.608 | 0.587 |
# Drop the redundant fold column and summarise the CC values per model.
tabla_cc.drop(columns = "Pliegues", inplace=True)
tabla_cc.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.848500 | 0.874400 | 0.551500 | 0.578300 | 0.562000 | 0.588100 | 0.584500 | 0.613000 | 0.781000 | 0.931800 | 0.569300 | 0.594300 |
| std | 0.095342 | 0.094558 | 0.255903 | 0.274144 | 0.235375 | 0.252187 | 0.212598 | 0.257212 | 0.222787 | 0.079698 | 0.225801 | 0.236204 |
| min | 0.660000 | 0.647000 | 0.065000 | -0.042000 | 0.160000 | 0.047000 | 0.245000 | -0.009000 | 0.269000 | 0.734000 | 0.202000 | 0.109000 |
| 25% | 0.812250 | 0.860500 | 0.413750 | 0.487750 | 0.444750 | 0.476500 | 0.479000 | 0.539500 | 0.681000 | 0.906750 | 0.467750 | 0.480250 |
| 50% | 0.864500 | 0.908500 | 0.542500 | 0.584500 | 0.534500 | 0.595500 | 0.556000 | 0.664000 | 0.857500 | 0.972000 | 0.536000 | 0.592500 |
| 75% | 0.927500 | 0.933500 | 0.749750 | 0.752500 | 0.749250 | 0.752750 | 0.785000 | 0.795250 | 0.930750 | 0.981000 | 0.749000 | 0.751000 |
| max | 0.943000 | 0.948000 | 0.888000 | 0.913000 | 0.887000 | 0.910000 | 0.860000 | 0.873000 | 0.989000 | 0.985000 | 0.884000 | 0.907000 |
import plotly.graph_objects as go

# Minimal graph_objects example.  Fixed: the trace class is `go.Box` (capital
# B); `go.box` does not exist on plotly.graph_objects and raises
# AttributeError.
fig = go.Figure(
    data=[go.Box(y=[2, 1, 3])],
    layout_title_text="A Figure Displaying Itself"
)
fig
# Interactive box plot of the RMSE table.  Removed leftover debugging lines:
# `from flask import Markup` / `Markup(fig)` (no effect, result discarded) and
# the bare `plotly.offline.plot` expression (`plotly` itself was never bound,
# so it raised NameError).
fig = px.box(tabla_rsme, y = tabla_rsme.columns)
fig.show()
tabla_cc.apply(stats.shapiro)
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.880652 | 0.771550 | 0.958121 | 0.887266 | 0.957226 | 0.912596 | 0.934377 | 0.846537 | 0.846381 | 0.716735 | 0.948635 | 0.923018 |
| 1 | 0.132762 | 0.006532 | 0.764250 | 0.157936 | 0.753839 | 0.299302 | 0.492296 | 0.052824 | 0.052599 | 0.001406 | 0.652372 | 0.382816 |
La distribución para RF_lag5 no es normal. Tampoco para GBR_Lag5
# Density plot of the CC distributions, legend placed outside the axes.
f = plt.figure()
plt.title("Distribuiciones de los resultados CC")
tabla_cc.plot(kind='density', ax=f.gca())
f.gca().set_xlabel('CC')
plt.legend(loc='right', bbox_to_anchor=(1.75,0.5))
# Friedman test on the CC results.  Fixed: the original passed `tabla_rsme`
# here (copy-paste slip) even though this section analyses `tabla_cc`.
test = stats.friedmanchisquare(*[tabla_cc[col] for col in tabla_cc.columns])
test
Sin diferencias significativas. Probamos con Nemenyi
scp.posthoc_nemenyi_friedman(tabla_cc, melted=False)
# Wilcoxon ranking for CC with Holm correction; higher correlation is better
# (decreasing=False).  Fixed: the original ranked `tabla_rsme` here
# (copy-paste slip) -- decreasing=False is wrong for an error measure like
# RMSE but correct for the CC this section is about.
cc_ranks = pairwise_test(tabla_cc, parametric=False, decreasing=False, ties= False, method="holm")
cc_ranks
# Final comparison table across all three measures.  `keys` labels each
# sub-frame; without it the concatenated result repeats the wins/losses/diff
# column names three times, ambiguously.
final_ranks = pd.concat([rsme_ranks, mae_ranks, cc_ranks], axis=1, keys=["RMSE", "MAE", "CC"])
final_ranks